library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(countrycode)
theme_HS<-list(theme(plot.title = element_text(lineheight=1, size=15, face="bold"), #set parameters of title
plot.subtitle = element_text(lineheight=1, size=12, face="bold"), # set parameters of subtitle
plot.caption = element_text(lineheight=1, size=13, hjust=1), # set parameters of caption
legend.title = element_blank (), # legend title (we don't want a title for the legens)
legend.text = element_text(colour="black", size = 15), # set parameters of legend text
legend.position="bottom", # set position of legend
# legend.justification=c(1,0), # set justification of legend
legend.background = element_rect(fill=NA, colour = NA), # set legend background
legend.key.size = unit(1.5, 'lines'), # set size of simbols of the legend
legend.key = element_rect(colour = NA, fill = NA), #set background of simbols in the legend
axis.title.x = element_blank (), # set x axis title (we don't want a title for the legend)
axis.text.x = element_text(angle = 0,vjust=0.5, size=15,colour="black"), #set parameters of x axis text
axis.title.y = element_text(vjust=2, size=15,colour="black"), # set y axis title
axis.text.y = element_text(vjust=0.5, size=15,colour="black"), #set parameters of y axis text
strip.text = element_text(size=15, face="bold"), #text for facets
plot.background = element_rect(fill = "white"), # set color of the background of the plot
panel.grid.major=element_line(colour="#E6E6E6",linewidth=.5), # set color of major grid lines
panel.grid.minor=element_line(colour="#E6E6E6",linewidth=.15), # set color of minor grid lines
panel.border = element_rect(colour = "#585858", fill=NA, linewidth=.75), #set color of panel border line
panel.background =element_rect(fill ="#FFFFFF", colour = "#FFFFFF"))) # set color of the background of the panel of the plot
icpc_data <- read.csv("data/icpc-full.csv")
icpc_data$Year <- as.numeric(icpc_data$Year)
participation_summary <- icpc_data %>%
group_by(Year) %>%
summarize(
num_countries = n_distinct(Country),
num_teams = n_distinct(Team)
)
head(participation_summary)
## # A tibble: 6 × 3
## Year num_countries num_teams
## <dbl> <int> <int>
## 1 1999 21 62
## 2 2000 27 60
## 3 2001 27 64
## 4 2002 27 64
## 5 2003 25 68
## 6 2004 31 73
plot <- ggplot(participation_summary, aes(x = Year, y = num_countries)) +
geom_line(color = "steelblue", size = 1) +
geom_point(color = "steelblue", size = 2) +
annotate("rect", xmin = 2020, xmax = Inf, ymin = -Inf, ymax = Inf, alpha = 0.3, fill = "red") +
annotate("text", x = 2020, y = 55,
label = "COVID-19\nPandemic", vjust = 1, hjust = -0.2, color = "red", size = 5, bold = TRUE) +
scale_x_continuous(breaks = seq(1999, 2023, by = 3), limits = c(1999, 2023)) +
labs(
title = "Growth in Number of Participating Countries from 1999 to present",
x = "Year",
y = "Number of Countries"
) +
theme_HS
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in annotate("text", x = 2020, y = 55, label = "COVID-19\nPandemic", :
## Ignoring unknown parameters: `bold`
plot

ggsave("plot.png", plot, width = 10, height = 6, units = "in", dpi = 300)
plot <- ggplot(participation_summary, aes(x = Year, y = num_teams)) +
geom_line(color = "darkorange", size = 1) +
geom_point(color = "darkorange", size = 2) +
annotate("rect", xmin = 2020, xmax = Inf, ymin = -Inf, ymax = Inf, alpha = 0.3, fill = "red") +
annotate("text", x = 2020, y = 150,
label = "COVID-19\nPandemic", vjust = 1, hjust = -0.2, color = "red", size = 5, bold = TRUE) +
scale_x_continuous(breaks = seq(1999, 2023, by = 3), limits = c(1999, 2023)) +
labs(
title = "Growth in Number of Participating Teams from 1999 to present",
x = "Year",
y = "Number of Teams"
) +
theme_HS
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in annotate("text", x = 2020, y = 150, label = "COVID-19\nPandemic", :
## Ignoring unknown parameters: `bold`
plot

ggsave("plot2.png", plot, width = 10, height = 6, units = "in", dpi = 300)
icpc_data <- icpc_data %>%
mutate(Gold = as.logical(Gold),
Silver = as.logical(Silver),
Bronze = as.logical(Bronze))
icpc_data <- icpc_data %>%
mutate(Total_Medals = Gold + Silver + Bronze)
print(sum(icpc_data$Total_Medals))
## [1] 317
university_medals <- icpc_data %>%
group_by(University, Country) %>%
summarize(Total_Medals = sum(Total_Medals, na.rm = TRUE)) %>%
arrange(desc(Total_Medals)) %>%
head(10) %>%
mutate(University_Country = paste(University, " (", Country, ")", sep = ""))
## `summarise()` has grouped output by 'University'. You can override using the
## `.groups` argument.
country_medals <- icpc_data %>%
group_by(Country) %>%
summarize(Total_Medals = sum(Total_Medals, na.rm = TRUE)) %>%
arrange(desc(Total_Medals)) %>%
head(10)
print(university_medals)
## # A tibble: 10 × 4
## # Groups: University [10]
## University Country Total_Medals University_Country
## <chr> <chr> <int> <chr>
## 1 Tsinghua University China 16 Tsinghua Universi…
## 2 University of Warsaw Poland 16 University of War…
## 3 Moscow State University Russia 15 Moscow State Univ…
## 4 Massachusetts Institute of Technology United… 14 Massachusetts Ins…
## 5 Shanghai Jiao Tong University China 14 Shanghai Jiao Ton…
## 6 St. Petersburg State University Russia 13 St. Petersburg St…
## 7 University of Waterloo Canada 12 University of Wat…
## 8 The University of Tokyo Japan 11 The University of…
## 9 National Taiwan University Taiwan 8 National Taiwan U…
## 10 Peking University China 8 Peking University…
print(country_medals)
## # A tibble: 10 × 2
## Country Total_Medals
## <chr> <int>
## 1 Russia 87
## 2 China 53
## 3 United States 41
## 4 Poland 23
## 5 Canada 17
## 6 Japan 14
## 7 South Korea 11
## 8 Taiwan 8
## 9 Ukraine 8
## 10 Belarus 6
icpc_data <- icpc_data %>%
mutate(Region = countrycode(Country, "country.name", "region"),
Continent = countrycode(Country, "country.name", "continent"))
south_america_regions <- c("Latin America & Caribbean")
north_america_regions <- c("North America")
head(icpc_data)
## Year Date Host City Venue Rank
## 1 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 1
## 2 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 2
## 3 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 3
## 4 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 4
## 5 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 5
## 6 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 6
## University Country
## 1 University of Waterloo Canada
## 2 Albert-Ludwigs-Universität Freiburg Germany
## 3 St. Petersburg Institute of Fine Mechanics & Optics Russia
## 4 University of Bucharest Romania
## 5 Duke University United States
## 6 California Polytechnic State University United States
## Team Contestant.1 Contestant.2 Contestant.3 Gold
## 1 U Waterloo David Kennedy Ondrej Lhotak Viet-Trung Luu TRUE
## 2 Freiburg B Team FALSE
## 3 St. Petersburg IFMO Alexander Volkov Matvey Kazakov Vladimir Lyovkin FALSE
## 4 UNIBUC Bucharest FALSE
## 5 Duke Blue Devils FALSE
## 6 Cal Poly Gold FALSE
## Silver Bronze Honorable Score Total Score.Percentage Penalty Prize
## 1 FALSE FALSE False 6 8 0.75 948 World Champion
## 2 TRUE FALSE False 6 8 0.75 992
## 3 FALSE TRUE False 6 8 0.75 1046
## 4 FALSE TRUE False 6 8 0.75 1048
## 5 FALSE TRUE False 6 8 0.75 1337
## 6 FALSE TRUE False 5 8 0.62 724
## Total_Medals Region Continent
## 1 1 North America Americas
## 2 1 Europe & Central Asia Europe
## 3 1 Europe & Central Asia Europe
## 4 1 Europe & Central Asia Europe
## 5 1 North America Americas
## 6 1 North America Americas
print(unique(icpc_data$Region))
## [1] "North America" "Europe & Central Asia"
## [3] "East Asia & Pacific" "Latin America & Caribbean"
## [5] "South Asia" "Middle East & North Africa"
## [7] "Sub-Saharan Africa"
icpc_data <- icpc_data %>%
mutate(Continent = case_when(
Region %in% south_america_regions ~ "South America",
Region %in% north_america_regions ~ "North America",
TRUE ~ Continent
))
continent_medals <- icpc_data %>%
group_by(Continent) %>%
summarize(Total_Medals = sum(Total_Medals, na.rm = TRUE)) %>%
arrange(desc(Total_Medals))
plot <- ggplot(university_medals, aes(x = reorder(University_Country, Total_Medals), y = Total_Medals)) +
geom_bar(stat = "identity", fill = "purple") +
coord_flip() +
scale_y_continuous(breaks = seq(0, 16, by = 2)) +
labs(
title = "Top 10 Universities with Most Medals",
y = "Total Medals",
x = NULL
) +
theme_HS
plot

ggsave("plot3.png", plot, width = 12, height = 6, units = "in", dpi = 300)
print(max(country_medals$Total_Medals))
## [1] 87
plot <- ggplot(country_medals, aes(x = reorder(Country, Total_Medals), y = Total_Medals)) +
geom_bar(stat = "identity", fill = "darkorange") +
scale_y_continuous(breaks = seq(0, 87, by = 5)) +
coord_flip() +
labs(
title = "Top 10 Countries with Most Medals",
y = "Total Medals",
x = NULL
) +
theme_HS
plot

ggsave("plot4.png", plot, width = 12, height = 6, units = "in", dpi = 300)
print(max(continent_medals$Total_Medals))
## [1] 156
plot <- ggplot(continent_medals, aes(x = reorder(Continent, Total_Medals), y = Total_Medals)) +
geom_bar(stat = "identity", fill = "forestgreen") +
coord_flip() +
scale_y_continuous(breaks = seq(0, 156, by = 12)) +
labs(
title = "Total Medals by Continent",
x = NULL,
y = "Total Medals"
) +
theme_HS
plot

ggsave("plot5.png", plot, width = 12, height = 6, units = "in", dpi = 300)
total_medals <- sum(continent_medals$Total_Medals)
print(total_medals)
## [1] 317
library(broom)
library(tidyr)
icpc_data <- read.csv("data/icpc-full.csv")
icpc_data <- icpc_data %>%
mutate(Gold = as.logical(Gold),
Silver = as.logical(Silver),
Bronze = as.logical(Bronze))
icpc_data <- icpc_data %>%
mutate(Total_Medals = Gold + Silver + Bronze)
country_year_medals <- icpc_data %>%
group_by(Country, Year) %>%
summarize(Total_Medals = sum(Total_Medals, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
all_years <- seq(min(country_year_medals$Year), max(country_year_medals$Year))
all_countries <- unique(country_year_medals$Country)
all_combinations <- expand.grid(Country = all_countries, Year = all_years)
country_year_medals_complete <- all_combinations %>%
left_join(country_year_medals, by = c("Country", "Year")) %>%
mutate(Total_Medals = ifelse(is.na(Total_Medals), 0, Total_Medals))
get_slope <- function(data) {
model <- lm(Total_Medals ~ Year, data = data)
slope <- coef(model)["Year"]
return(slope)
}
country_slopes <- country_year_medals_complete %>%
group_by(Country) %>%
summarize(Slope = get_slope(cur_data_all())) %>%
arrange(desc(Slope))
## Warning: There was 1 warning in `summarize()`.
## ℹ In argument: `Slope = get_slope(cur_data_all())`.
## ℹ In group 1: `Country = "Afghanistan"`.
## Caused by warning:
## ! `cur_data_all()` was deprecated in dplyr 1.1.0.
## ℹ Please use `pick()` instead.
top_4_countries <- country_slopes %>%
top_n(4, Slope) %>%
pull(Country)
top_4_countries_data <- country_year_medals_complete %>%
filter(Country %in% top_4_countries)
top_4_countries_data$Country <- factor(top_4_countries_data$Country, levels = top_4_countries)
facet_plot <- ggplot(top_4_countries_data, aes(x = Year, y = Total_Medals)) +
geom_line() +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
scale_y_continuous(breaks = seq(0, 2, by = 1), limits = c(0, 2)) +
scale_x_continuous(breaks = seq(1999, 2023, by = 4), limits = c(1999, 2023)) +
facet_wrap(~ Country, scales = "free_y") +
labs(
title = "Medals Over Years for Top 4 Countries with Highest Slopes",
x = NULL,
y = "Total Medals"
) + theme_HS
top_10_countries <- country_slopes %>%
top_n(10, Slope)
bar_plot <- ggplot(top_10_countries, aes(x = reorder(Country, Slope), y = Slope)) +
geom_bar(stat = "identity", fill = "dodgerblue") +
coord_flip() +
scale_y_continuous(breaks = seq(0, 0.4, by = 0.01)) +
labs(
title = "Top 10 Countries by Linear Regression Slope",
subtitle = "Indicating the Performance Trend Over Years",
x = NULL,
y = "Slope (Performance Trend)"
) + theme_HS
facet_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 20 rows containing missing values (`geom_smooth()`).

bar_plot

ggsave("plot7.png", facet_plot, width = 12, height = 8, units = "in", dpi = 300)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 20 rows containing missing values (`geom_smooth()`).
ggsave("plot6.png", bar_plot, width = 12, height = 8, units = "in", dpi = 300)
icpc_data <- read.csv("data/icpc-full.csv")
head(icpc_data)
## Year Date Host City Venue Rank
## 1 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 1
## 2 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 2
## 3 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 3
## 4 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 4
## 5 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 5
## 6 1999 1999-04-11 Netherlands Eindhoven Eindhoven University of Technology 6
## University Country
## 1 University of Waterloo Canada
## 2 Albert-Ludwigs-Universität Freiburg Germany
## 3 St. Petersburg Institute of Fine Mechanics & Optics Russia
## 4 University of Bucharest Romania
## 5 Duke University United States
## 6 California Polytechnic State University United States
## Team Contestant.1 Contestant.2 Contestant.3 Gold
## 1 U Waterloo David Kennedy Ondrej Lhotak Viet-Trung Luu True
## 2 Freiburg B Team False
## 3 St. Petersburg IFMO Alexander Volkov Matvey Kazakov Vladimir Lyovkin False
## 4 UNIBUC Bucharest False
## 5 Duke Blue Devils False
## 6 Cal Poly Gold False
## Silver Bronze Honorable Score Total Score.Percentage Penalty Prize
## 1 False False False 6 8 0.75 948 World Champion
## 2 True False False 6 8 0.75 992
## 3 False True False 6 8 0.75 1046
## 4 False True False 6 8 0.75 1048
## 5 False True False 6 8 0.75 1337
## 6 False True False 5 8 0.62 724
total_teams_per_year <- icpc_data %>%
group_by(Year) %>%
summarize(Total_Teams_Year = n())
ranking_data <- icpc_data %>%
group_by(Country, Year) %>%
summarize(Average_Rank = ifelse(all(is.na(Rank)), NA, mean(Rank, na.rm = TRUE))) %>%
ungroup()
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
ranking_data <- ranking_data %>%
left_join(total_teams_per_year, by = "Year")
ranking_data <- ranking_data %>%
mutate(Average_Rank = ifelse(is.na(Average_Rank), Total_Teams_Year, Average_Rank))
print(ranking_data)
## # A tibble: 884 × 4
## Country Year Average_Rank Total_Teams_Year
## <chr> <int> <dbl> <int>
## 1 Afghanistan 2020 115 117
## 2 Argentina 1999 38 62
## 3 Argentina 2000 27 60
## 4 Argentina 2001 51 64
## 5 Argentina 2002 10 64
## 6 Argentina 2003 12 68
## 7 Argentina 2004 53 73
## 8 Argentina 2005 44 78
## 9 Argentina 2006 56 83
## 10 Argentina 2007 14 87
## # ℹ 874 more rows
ranking_data <- ranking_data %>%
mutate(Scaled_Rank = (Average_Rank / Total_Teams_Year) * 100)
all_years <- seq(min(ranking_data$Year), max(ranking_data$Year))
all_countries <- unique(ranking_data$Country)
all_combinations <- expand.grid(Country = all_countries, Year = all_years)
ranking_data_complete <- all_combinations %>%
left_join(ranking_data, by = c("Country", "Year")) %>%
mutate(Scaled_Rank = ifelse(is.na(Scaled_Rank), 100, Scaled_Rank))
get_slope <- function(data) {
model <- lm(Scaled_Rank ~ Year, data = data)
slope <- coef(model)["Year"]
return(slope)
}
country_slopes <- ranking_data_complete %>%
group_by(Country) %>%
summarize(Slope = get_slope(cur_data_all())) %>%
arrange(Slope)
## Warning: There was 1 warning in `summarize()`.
## ℹ In argument: `Slope = get_slope(cur_data_all())`.
## ℹ In group 1: `Country = "Afghanistan"`.
## Caused by warning:
## ! `cur_data_all()` was deprecated in dplyr 1.1.0.
## ℹ Please use `pick()` instead.
print(country_slopes)
## # A tibble: 78 × 2
## Country Slope
## <chr> <dbl>
## 1 Switzerland -4.12
## 2 United Kingdom -3.64
## 3 Serbia -3.05
## 4 Indonesia -2.63
## 5 France -2.54
## 6 Viet Nam -2.33
## 7 Ukraine -2.24
## 8 Kazakhstan -2.19
## 9 Cuba -2.10
## 10 Latvia -1.77
## # ℹ 68 more rows
top_10_countries <- country_slopes %>%
top_n(10, -Slope)
top_4_countries <- country_slopes %>%
top_n(4, -Slope) %>%
pull(Country)
top_4_countries_data <- ranking_data_complete %>%
filter(Country %in% top_4_countries)
top_4_countries_data$Country <- factor(top_4_countries_data$Country, levels = top_4_countries)
print(top_10_countries)
## # A tibble: 10 × 2
## Country Slope
## <chr> <dbl>
## 1 Switzerland -4.12
## 2 United Kingdom -3.64
## 3 Serbia -3.05
## 4 Indonesia -2.63
## 5 France -2.54
## 6 Viet Nam -2.33
## 7 Ukraine -2.24
## 8 Kazakhstan -2.19
## 9 Cuba -2.10
## 10 Latvia -1.77
print(top_4_countries_data)
## Country Year Average_Rank Total_Teams_Year Scaled_Rank
## 1 Indonesia 1999 NA NA 100.000000
## 2 Serbia 1999 NA NA 100.000000
## 3 Switzerland 1999 NA NA 100.000000
## 4 United Kingdom 1999 NA NA 100.000000
## 5 Indonesia 2000 NA NA 100.000000
## 6 Serbia 2000 NA NA 100.000000
## 7 Switzerland 2000 NA NA 100.000000
## 8 United Kingdom 2000 NA NA 100.000000
## 9 Indonesia 2001 NA NA 100.000000
## 10 Serbia 2001 NA NA 100.000000
## 11 Switzerland 2001 NA NA 100.000000
## 12 United Kingdom 2001 NA NA 100.000000
## 13 Indonesia 2002 NA NA 100.000000
## 14 Serbia 2002 NA NA 100.000000
## 15 Switzerland 2002 NA NA 100.000000
## 16 United Kingdom 2002 NA NA 100.000000
## 17 Indonesia 2003 NA NA 100.000000
## 18 Serbia 2003 NA NA 100.000000
## 19 Switzerland 2003 NA NA 100.000000
## 20 United Kingdom 2003 NA NA 100.000000
## 21 Indonesia 2004 NA NA 100.000000
## 22 Serbia 2004 NA NA 100.000000
## 23 Switzerland 2004 NA NA 100.000000
## 24 United Kingdom 2004 NA NA 100.000000
## 25 Indonesia 2005 NA NA 100.000000
## 26 Serbia 2005 NA NA 100.000000
## 27 Switzerland 2005 NA NA 100.000000
## 28 United Kingdom 2005 NA NA 100.000000
## 29 Indonesia 2006 NA NA 100.000000
## 30 Serbia 2006 NA NA 100.000000
## 31 Switzerland 2006 NA NA 100.000000
## 32 United Kingdom 2006 NA NA 100.000000
## 33 Indonesia 2007 57.0 87 65.517241
## 34 Serbia 2007 NA NA 100.000000
## 35 Switzerland 2007 NA NA 100.000000
## 36 United Kingdom 2007 NA NA 100.000000
## 37 Indonesia 2008 55.0 100 55.000000
## 38 Serbia 2008 NA NA 100.000000
## 39 Switzerland 2008 NA NA 100.000000
## 40 United Kingdom 2008 29.0 100 29.000000
## 41 Indonesia 2009 100.0 100 100.000000
## 42 Serbia 2009 NA NA 100.000000
## 43 Switzerland 2009 NA NA 100.000000
## 44 United Kingdom 2009 15.0 100 15.000000
## 45 Indonesia 2010 NA NA 100.000000
## 46 Serbia 2010 NA NA 100.000000
## 47 Switzerland 2010 NA NA 100.000000
## 48 United Kingdom 2010 NA NA 100.000000
## 49 Indonesia 2011 NA NA 100.000000
## 50 Serbia 2011 NA NA 100.000000
## 51 Switzerland 2011 53.0 105 50.476190
## 52 United Kingdom 2011 NA NA 100.000000
## 53 Indonesia 2012 56.0 112 50.000000
## 54 Serbia 2012 NA NA 100.000000
## 55 Switzerland 2012 NA NA 100.000000
## 56 United Kingdom 2012 NA NA 100.000000
## 57 Indonesia 2013 23.0 118 19.491525
## 58 Serbia 2013 NA NA 100.000000
## 59 Switzerland 2013 31.0 118 26.271186
## 60 United Kingdom 2013 NA NA 100.000000
## 61 Indonesia 2014 98.5 120 82.083333
## 62 Serbia 2014 NA NA 100.000000
## 63 Switzerland 2014 65.0 120 54.166667
## 64 United Kingdom 2014 54.0 120 45.000000
## 65 Indonesia 2015 79.0 126 62.698413
## 66 Serbia 2015 NA NA 100.000000
## 67 Switzerland 2015 66.0 126 52.380952
## 68 United Kingdom 2015 NA NA 100.000000
## 69 Indonesia 2016 NA NA 100.000000
## 70 Serbia 2016 NA NA 100.000000
## 71 Switzerland 2016 71.0 128 55.468750
## 72 United Kingdom 2016 92.0 128 71.875000
## 73 Indonesia 2017 67.5 133 50.751880
## 74 Serbia 2017 NA NA 100.000000
## 75 Switzerland 2017 34.0 133 25.563910
## 76 United Kingdom 2017 106.0 133 79.699248
## 77 Indonesia 2018 41.0 140 29.285714
## 78 Serbia 2018 NA NA 100.000000
## 79 Switzerland 2018 29.0 140 20.714286
## 80 United Kingdom 2018 43.5 140 31.071429
## 81 Indonesia 2019 65.0 135 48.148148
## 82 Serbia 2019 42.0 135 31.111111
## 83 Switzerland 2019 43.0 135 31.851852
## 84 United Kingdom 2019 17.0 135 12.592593
## 85 Indonesia 2020 32.0 117 27.350427
## 86 Serbia 2020 24.0 117 20.512821
## 87 Switzerland 2020 40.0 117 34.188034
## 88 United Kingdom 2020 6.0 117 5.128205
## 89 Indonesia 2021 63.5 132 48.106061
## 90 Serbia 2021 36.0 132 27.272727
## 91 Switzerland 2021 5.0 132 3.787879
## 92 United Kingdom 2021 14.5 132 10.984848
## 93 Indonesia 2022 78.0 124 62.903226
## 94 Serbia 2022 15.0 124 12.096774
## 95 Switzerland 2022 44.0 124 35.483871
## 96 United Kingdom 2022 32.0 124 25.806452
## 97 Indonesia 2023 77.0 130 59.230769
## 98 Serbia 2023 21.0 130 16.153846
## 99 Switzerland 2023 38.0 130 29.230769
## 100 United Kingdom 2023 28.0 130 21.538462
facet_plot <- ggplot(top_4_countries_data, aes(x = Year, y = Scaled_Rank)) +
geom_line() +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
scale_y_continuous(breaks = seq(0, 100, by = 25), limits = c(0, 100)) +
scale_x_continuous(breaks = seq(1999, 2023, by = 4), limits = c(1999, 2023)) +
facet_wrap(~ Country, scales = "free_y") +
labs(
title = "Ranking Over Years for Top 4 Countries with Lowest Slopes",
x = NULL,
y = "Scaled Rank"
) + theme_HS
bar_plot <- ggplot(top_10_countries, aes(x = reorder(Country, -Slope), y = Slope)) +
geom_bar(stat = "identity", fill = "brown") +
coord_flip() +
labs(
title = "Top 10 Countries by Linear Regression Slope",
subtitle = "Indicating the Performance Trend Over Years",
x = NULL,
y = "Slope (Performance Trend)"
) + theme_HS
facet_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 59 rows containing missing values (`geom_smooth()`).

bar_plot

ggsave("plot9.png", facet_plot, width = 12, height = 8, units = "in", dpi = 300)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 59 rows containing missing values (`geom_smooth()`).
ggsave("plot8.png", bar_plot, width = 12, height = 8, units = "in", dpi = 300)
cuba_data <- ranking_data_complete %>%
filter(Country == "Cuba")
cuba_plot <- ggplot(cuba_data, aes(x = Year, y = Scaled_Rank)) +
geom_line() +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
scale_y_continuous(breaks = seq(0, 100, by = 25), limits = c(0, 100)) +
scale_x_continuous(breaks = seq(1999, 2023, by = 4), limits = c(1999, 2023)) +
labs(
title = "Ranking Over Years for Cuba",
x = NULL,
y = "Scaled Rank"
) + theme_HS
cuba_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 17 rows containing missing values (`geom_smooth()`).

ggsave("plot10.png", cuba_plot, width = 12, height = 6, units = "in", dpi = 300)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 17 rows containing missing values (`geom_smooth()`).
library(leaflet)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(htmltools)
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE
world <- st_read("data/world.shp")
## Reading layer `world' from data source
## `/home/brayand/Storage/School/from-data-to-knowledge-interpretation-visualization-presentation-course/Final Project/data/world.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 197 features and 63 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -55.8917 xmax: 180 ymax: 83.59961
## Geodetic CRS: GCS_unknown
data <- read_csv("data/icpc-full.csv")
## Rows: 2562 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Host, City, Venue, University, Country, Team, Contestant 1, Conte...
## dbl (6): Year, Rank, Score, Total, Score Percentage, Penalty
## lgl (4): Gold, Silver, Bronze, Honorable
## date (1): Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$Host[data$Host == "Czechia"] <- "Czech Rep."
data$Country[data$Country == "Czechia"] <- "Czech Rep."
host_counts <- data %>%
select(Year, Host) %>%
distinct() %>%
group_by(Host) %>%
summarise(Count = n()) %>%
ungroup()
total_teams_per_year <- icpc_data %>%
group_by(Year) %>%
summarize(Total_Teams_Year = n())
ranking_data <- icpc_data %>%
group_by(Country, Year) %>%
summarize(Average_Rank = ifelse(all(is.na(Rank)), NA, mean(Rank, na.rm = TRUE))) %>%
ungroup()
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
ranking_data <- ranking_data %>%
left_join(total_teams_per_year, by = "Year")
ranking_data <- ranking_data %>%
mutate(Average_Rank = ifelse(is.na(Average_Rank), Total_Teams_Year, Average_Rank))
print(ranking_data)
## # A tibble: 884 × 4
## Country Year Average_Rank Total_Teams_Year
## <chr> <int> <dbl> <int>
## 1 Afghanistan 2020 115 117
## 2 Argentina 1999 38 62
## 3 Argentina 2000 27 60
## 4 Argentina 2001 51 64
## 5 Argentina 2002 10 64
## 6 Argentina 2003 12 68
## 7 Argentina 2004 53 73
## 8 Argentina 2005 44 78
## 9 Argentina 2006 56 83
## 10 Argentina 2007 14 87
## # ℹ 874 more rows
ranking_data <- ranking_data %>%
mutate(Scaled_Rank = (Average_Rank / Total_Teams_Year) * 100)
all_years <- seq(min(ranking_data$Year), max(ranking_data$Year))
all_countries <- unique(ranking_data$Country)
all_combinations <- expand.grid(Country = all_countries, Year = all_years)
ranking_data_complete <- all_combinations %>%
left_join(ranking_data, by = c("Country", "Year")) %>%
mutate(Scaled_Rank = ifelse(is.na(Scaled_Rank), 100, Scaled_Rank))
average_ranking <- ranking_data_complete %>%
group_by(Country) %>%
summarize(Average_Rank = mean(Scaled_Rank, na.rm = TRUE)) %>%
arrange(Average_Rank)
world <- world %>%
left_join(host_counts, by = c("name" = "Host")) %>%
left_join(average_ranking, by = c("name" = "Country"))
world$Count[is.na(world$Count)] <- 0
pal_host <- colorNumeric("YlOrRd", domain = world$Count)
pal_rank <- colorNumeric("YlGnBu", domain = world$Average_Rank, reverse = TRUE)
total_hosted <- sum(world$Count)
International Collegiate Programming Contest (ICPC) Heatmap of Host
Countries
map_host <- leaflet(data = world) %>%
addTiles() %>%
addPolygons(
fillColor = ~pal_host(Count),
weight = 1,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(
weight = 3,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE
),
label = ~paste(name, ": ", Count, " times hosted"),
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"
)
) %>%
addLegend(
pal = pal_host,
values = ~Count,
opacity = 0.7,
title = "Times Hosted",
position = "bottomright"
)
map_host
International Collegiate Programming Contest (ICPC) Heatmap of
Ranking Average
world <- world %>%
filter(!is.na(Average_Rank))
map_rank <- leaflet(data = world) %>%
addTiles() %>%
addPolygons(
fillColor = ~pal_rank(Average_Rank),
weight = 1,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(
weight = 3,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE
),
label = ~ifelse(is.na(Average_Rank), paste(name, ": No Participation"), paste(name, ": Average Rank ", round(Average_Rank, 2))),
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"
)
) %>%
addLegend(
pal = pal_rank,
values = ~Average_Rank,
opacity = 0.7,
title = "Average Ranking",
position = "bottomright"
)
map_rank